# -*-coding:utf-8-*-
__author__ = 'Administrator'

from bs4 import BeautifulSoup
import urllib
import urllib2
import re
import urlparse
import os
# URL manager state shared by the crawler.
#   old_url: not referenced by any code visible in this file.
#   new_url: hrefs that url_manage() has already encountered.
old_url, new_url = set(), set()

def url_manage(root_url, base_url='http://www.5iweb.com.cn'):
    """Crawl ``root_url`` depth-first and download every linked zip file.

    Each anchor returned by ``parser(root_url)`` whose ``href`` has not been
    seen before is recorded in the module-level ``new_url`` set and its full
    URL is printed.  Links matching the zip pattern are saved to the current
    working directory under their base name (which is then printed); every
    other link is crawled recursively.

    Args:
        root_url: Absolute URL of the page to scan for links.
        base_url: Scheme and host prepended to each site-relative ``href``.
            Defaults to the site this script was written for.

    Returns:
        ``root_url``, unchanged.

    Raises:
        Errors from fetching ``root_url`` itself, or from downloading a zip
        linked directly from it, propagate to the caller.  Errors raised
        while crawling a linked page are suppressed so the remaining links
        of the current page are still processed.
    """
    for link in parser(root_url):
        href = link['href']
        if href in new_url:
            continue
        # Mark the href as seen before doing any work so that pages linking
        # back to each other cannot cause endless recursion.
        new_url.add(href)

        url = base_url + href
        print(url)

        if re.match(r'.+\.zip', href):
            file_name = os.path.split(href)[1]
            urllib.urlretrieve(url, file_name)
            print(file_name)
            # An archive is not an HTML page: do not fetch it a second time
            # just to search it for links.
            continue

        # New page: enter it and look for zip links there.
        try:
            url_manage(url, base_url)
        except Exception:
            # Best effort: one broken page must not abort the whole crawl.
            # KeyboardInterrupt / SystemExit are deliberately not caught so
            # the crawl can still be stopped by the user.
            continue

    return root_url


# Download routine
def down_zip():
    """Placeholder for the download step; does nothing and returns None."""
    return None


# Parser
def parser(url):
    """Fetch ``url`` and return its site-relative anchor tags.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The BeautifulSoup result of ``find_all`` for every ``<a>`` element
        whose ``href`` starts with ``/`` followed by at least one character.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the page cannot be fetched.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        content = response.read()
    finally:
        # Always release the connection, even if reading the body fails.
        response.close()

    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    return soup.find_all("a", href=re.compile(r"^/.+"))


# Program initialisation: start the crawl from the site root.
# NOTE(review): this runs whenever the module is loaded, including on import;
# there is no `if __name__ == '__main__'` guard — confirm that is intended.
root_url = 'http://www.5iweb.com.cn'
url_manage(root_url)


